import re
import numpy as np
import pandas as pd
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, SVC
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from pandas.core.dtypes.common import is_string_dtype, is_numeric_dtype
from imblearn.over_sampling import SMOTE
# Pre-instantiate shared preprocessing helpers used later in the notebook.
sm = SMOTE(random_state = 42)  # NOTE(review): SMOTE oversampler is never used below — confirm or remove
label_encoder = LabelEncoder()  # NOTE(review): never used below — confirm or remove
scaler=StandardScaler()  # z-score scaler, fitted later on tenure/MonthlyCharges/TotalCharges
import warnings
warnings.filterwarnings("ignore")  # silence library warnings notebook-wide
%matplotlib inline
# Load the first half of the churn data (demographics + phone/internet columns).
telecom_cust_churn1=pd.read_csv('TelcomCustomer-Churn_1.csv')
telecom_cust_churn1.head()
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No |
| 1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes |
| 2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes |
| 3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes |
| 4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No |
telecom_cust_churn1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 gender 7043 non-null object 2 SeniorCitizen 7043 non-null int64 3 Partner 7043 non-null object 4 Dependents 7043 non-null object 5 tenure 7043 non-null int64 6 PhoneService 7043 non-null object 7 MultipleLines 7043 non-null object 8 InternetService 7043 non-null object 9 OnlineSecurity 7043 non-null object dtypes: int64(2), object(8) memory usage: 550.4+ KB
TelcomCustomer-Churn_1 has 7043 data entries and 10 columns.
# Load the second half of the churn data (services, billing and the Churn target).
telecom_cust_churn2=pd.read_csv('TelcomCustomer-Churn_2.csv')
telecom_cust_churn2.head()
| customerID | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | 5575-GNVDE | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | 3668-QPYBK | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | 7795-CFOCW | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | 9237-HQITU | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
telecom_cust_churn2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 OnlineBackup 7043 non-null object 2 DeviceProtection 7043 non-null object 3 TechSupport 7043 non-null object 4 StreamingTV 7043 non-null object 5 StreamingMovies 7043 non-null object 6 Contract 7043 non-null object 7 PaperlessBilling 7043 non-null object 8 PaymentMethod 7043 non-null object 9 MonthlyCharges 7043 non-null float64 10 TotalCharges 7043 non-null object 11 Churn 7043 non-null object dtypes: float64(1), object(11) memory usage: 660.4+ KB
TelcomCustomer-Churn_2 has 7043 data entries and 12 columns.
# Inner-join the two halves on the shared customerID key (both files hold
# the same 7043 customers, so no rows are dropped).
telecom_cust_churn=pd.merge(telecom_cust_churn1, telecom_cust_churn2, on='customerID')
telecom_cust_churn.head()
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
telecom_cust_churn.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 7043 entries, 0 to 7042 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 gender 7043 non-null object 2 SeniorCitizen 7043 non-null int64 3 Partner 7043 non-null object 4 Dependents 7043 non-null object 5 tenure 7043 non-null int64 6 PhoneService 7043 non-null object 7 MultipleLines 7043 non-null object 8 InternetService 7043 non-null object 9 OnlineSecurity 7043 non-null object 10 OnlineBackup 7043 non-null object 11 DeviceProtection 7043 non-null object 12 TechSupport 7043 non-null object 13 StreamingTV 7043 non-null object 14 StreamingMovies 7043 non-null object 15 Contract 7043 non-null object 16 PaperlessBilling 7043 non-null object 17 PaymentMethod 7043 non-null object 18 MonthlyCharges 7043 non-null float64 19 TotalCharges 7043 non-null object 20 Churn 7043 non-null object dtypes: float64(1), int64(2), object(18) memory usage: 1.2+ MB
Merged dataset contains 7043 entries and 21 columns.
# Sanity check: the merged frame must carry exactly the union of the two
# source frames' columns. Uses `+` for list concatenation instead of
# calling the __add__ dunder directly — same result, idiomatic spelling.
columns1=set(telecom_cust_churn1.columns.tolist() + telecom_cust_churn2.columns.tolist())
columns2=set(telecom_cust_churn.columns.tolist())
'Identical columns' if columns1==columns2 else 'Non-Identical columns'
'Identical columns'
# Print the sorted column names of the unmerged and merged datasets for a
# visual side-by-side comparison. `+` replaces the direct __add__ dunder call.
print("Columns of unmerged Dataset:-", ", ".join(sorted(set(telecom_cust_churn1.columns.tolist() + telecom_cust_churn2.columns.tolist()))))
print()
print("Columns of merged Dataset:-", ", ".join(sorted(telecom_cust_churn.columns.tolist())))
Columns of unmerged Dataset:- Churn, Contract, Dependents, DeviceProtection, InternetService, MonthlyCharges, MultipleLines, OnlineBackup, OnlineSecurity, PaperlessBilling, Partner, PaymentMethod, PhoneService, SeniorCitizen, StreamingMovies, StreamingTV, TechSupport, TotalCharges, customerID, gender, tenure Columns of merged Dataset:- Churn, Contract, Dependents, DeviceProtection, InternetService, MonthlyCharges, MultipleLines, OnlineBackup, OnlineSecurity, PaperlessBilling, Partner, PaymentMethod, PhoneService, SeniorCitizen, StreamingMovies, StreamingTV, TechSupport, TotalCharges, customerID, gender, tenure
telecom_cust_churn.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 7043 entries, 0 to 7042 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 gender 7043 non-null object 2 SeniorCitizen 7043 non-null int64 3 Partner 7043 non-null object 4 Dependents 7043 non-null object 5 tenure 7043 non-null int64 6 PhoneService 7043 non-null object 7 MultipleLines 7043 non-null object 8 InternetService 7043 non-null object 9 OnlineSecurity 7043 non-null object 10 OnlineBackup 7043 non-null object 11 DeviceProtection 7043 non-null object 12 TechSupport 7043 non-null object 13 StreamingTV 7043 non-null object 14 StreamingMovies 7043 non-null object 15 Contract 7043 non-null object 16 PaperlessBilling 7043 non-null object 17 PaymentMethod 7043 non-null object 18 MonthlyCharges 7043 non-null float64 19 TotalCharges 7043 non-null object 20 Churn 7043 non-null object dtypes: float64(1), int64(2), object(18) memory usage: 1.2+ MB
telecom_cust_churn[telecom_cust_churn.duplicated()]
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn |
|---|
0 rows × 21 columns
No duplicates found in the dataset.
telecom_cust_churn.isnull().sum()
customerID 0 gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 0 Churn 0 dtype: int64
No empty data is found in the dataset.
# Inspect the category distribution of every categorical feature
# (customerID and the three numeric columns are excluded).
columns=telecom_cust_churn.columns.drop(['customerID','tenure','MonthlyCharges','TotalCharges'])
for col in columns:
    print('ColumnName:',col)
    print(telecom_cust_churn[col].value_counts())
    # confirm the column covers all 7043 rows
    print(telecom_cust_churn[col].size==7043)
    print()
ColumnName: gender Male 3555 Female 3488 Name: gender, dtype: int64 True ColumnName: SeniorCitizen 0 5901 1 1142 Name: SeniorCitizen, dtype: int64 True ColumnName: Partner No 3641 Yes 3402 Name: Partner, dtype: int64 True ColumnName: Dependents No 4933 Yes 2110 Name: Dependents, dtype: int64 True ColumnName: PhoneService Yes 6361 No 682 Name: PhoneService, dtype: int64 True ColumnName: MultipleLines No 3390 Yes 2971 No phone service 682 Name: MultipleLines, dtype: int64 True ColumnName: InternetService Fiber optic 3096 DSL 2421 No 1526 Name: InternetService, dtype: int64 True ColumnName: OnlineSecurity No 3498 Yes 2019 No internet service 1526 Name: OnlineSecurity, dtype: int64 True ColumnName: OnlineBackup No 3088 Yes 2429 No internet service 1526 Name: OnlineBackup, dtype: int64 True ColumnName: DeviceProtection No 3095 Yes 2422 No internet service 1526 Name: DeviceProtection, dtype: int64 True ColumnName: TechSupport No 3473 Yes 2044 No internet service 1526 Name: TechSupport, dtype: int64 True ColumnName: StreamingTV No 2810 Yes 2707 No internet service 1526 Name: StreamingTV, dtype: int64 True ColumnName: StreamingMovies No 2785 Yes 2732 No internet service 1526 Name: StreamingMovies, dtype: int64 True ColumnName: Contract Month-to-month 3875 Two year 1695 One year 1473 Name: Contract, dtype: int64 True ColumnName: PaperlessBilling Yes 4171 No 2872 Name: PaperlessBilling, dtype: int64 True ColumnName: PaymentMethod Electronic check 2365 Mailed check 1612 Bank transfer (automatic) 1544 Credit card (automatic) 1522 Name: PaymentMethod, dtype: int64 True ColumnName: Churn No 5174 Yes 1869 Name: Churn, dtype: int64 True
As seen in the dataset, TotalCharges should have a float dtype but is stored as object, so the column needs further analysis.
telecom_cust_churn.TotalCharges.sort_values()
936
3826
4380
753
5218
...
6646 997.75
5598 998.1
3686 999.45
3353 999.8
2845 999.9
Name: TotalCharges, Length: 7043, dtype: object
As TotalCharges contains empty values, which need to be imputed with some central tendency value.
# TotalCharges is stored as object because blank strings (' ') appear in the
# raw data. Coerce to numeric (blanks -> NaN), then impute the NaNs with the
# column mean. Column-level assignment replaces the attribute-assignment +
# inplace-fillna pattern, which is deprecated chained assignment in pandas 2.x.
telecom_cust_churn['TotalCharges'] = pd.to_numeric(
    telecom_cust_churn['TotalCharges'].replace(' ', np.nan), errors='coerce')
telecom_cust_churn['TotalCharges'] = telecom_cust_churn['TotalCharges'].fillna(
    telecom_cust_churn['TotalCharges'].mean())
telecom_cust_churn.TotalCharges.isnull().sum()
0
Imputed missing values with mean for TotalCharges columns.
telecom_cust_churn.dtypes
customerID object gender object SeniorCitizen int64 Partner object Dependents object tenure int64 PhoneService object MultipleLines object InternetService object OnlineSecurity object OnlineBackup object DeviceProtection object TechSupport object StreamingTV object StreamingMovies object Contract object PaperlessBilling object PaymentMethod object MonthlyCharges float64 TotalCharges float64 Churn object dtype: object
MonthlyCharges and TotalCharges have dtype as float64.
# One donut chart per categorical (object-dtype) column showing its value distribution.
for col in telecom_cust_churn.select_dtypes(include='object').columns.drop('customerID'):
    fig=px.pie(telecom_cust_churn, hole=0.3, values=telecom_cust_churn[col].value_counts(), names=telecom_cust_churn[col].value_counts().index, color_discrete_sequence=px.colors.sequential.turbid_r, title=col+' Column Representation')
    fig.show()
# SeniorCitizen is int-coded (0/1), so it is charted separately.
fig=px.pie(telecom_cust_churn, hole=0.3, values=telecom_cust_churn.SeniorCitizen.value_counts(), names=telecom_cust_churn.SeniorCitizen.value_counts().index, color_discrete_sequence=px.colors.sequential.turbid_r, title='SeniorCitizen Column Representation')
fig.show();
Pie Chart Graphical Representation description:
Converting data having Yes/Male as 1 and No/Female as 0. Using get_dummies() function, converts categorical data into dummy or indicator variables.
# Duplicating dataset
telecom_cust_churn_dup=telecom_cust_churn.copy(deep=True)
telecom_cust_churn_dup.shape
(7043, 21)
telecom_cust_churn.drop(columns='customerID', inplace=True, axis=1)
# Encode gender column: Male -> 1, Female -> 0
telecom_cust_churn.gender = (telecom_cust_churn.gender == 'Male').astype(int)
# Binary Yes/No columns -> 1/0
for binary_col in ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']:
    telecom_cust_churn[binary_col] = (telecom_cust_churn[binary_col] == 'Yes').astype(int)
# One-hot encode the remaining multi-category columns (all levels kept)
telecom_cust_churn = pd.get_dummies(
    telecom_cust_churn,
    columns=['InternetService', 'Contract', 'PaymentMethod', 'OnlineSecurity',
             'MultipleLines', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
             'StreamingTV', 'StreamingMovies'],
    drop_first=False, prefix_sep='_')
telecom_cust_churn.head()
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | PaperlessBilling | MonthlyCharges | TotalCharges | Churn | ... | DeviceProtection_Yes | TechSupport_No | TechSupport_No internet service | TechSupport_Yes | StreamingTV_No | StreamingTV_No internet service | StreamingTV_Yes | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 29.85 | 29.85 | 0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 1 | 1 | 0 | 0 | 0 | 34 | 1 | 0 | 56.95 | 1889.50 | 0 | ... | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 2 | 1 | 0 | 0 | 0 | 2 | 1 | 1 | 53.85 | 108.15 | 1 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 3 | 1 | 0 | 0 | 0 | 45 | 0 | 0 | 42.30 | 1840.75 | 0 | ... | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 70.70 | 151.65 | 1 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
5 rows × 41 columns
telecom_cust_churn.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| gender | 7043.0 | 0.504756 | 0.500013 | 0.00 | 0.000 | 1.00 | 1.00 | 1.00 |
| SeniorCitizen | 7043.0 | 0.162147 | 0.368612 | 0.00 | 0.000 | 0.00 | 0.00 | 1.00 |
| Partner | 7043.0 | 0.483033 | 0.499748 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| Dependents | 7043.0 | 0.299588 | 0.458110 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| tenure | 7043.0 | 32.371149 | 24.559481 | 0.00 | 9.000 | 29.00 | 55.00 | 72.00 |
| PhoneService | 7043.0 | 0.903166 | 0.295752 | 0.00 | 1.000 | 1.00 | 1.00 | 1.00 |
| PaperlessBilling | 7043.0 | 0.592219 | 0.491457 | 0.00 | 0.000 | 1.00 | 1.00 | 1.00 |
| MonthlyCharges | 7043.0 | 64.761692 | 30.090047 | 18.25 | 35.500 | 70.35 | 89.85 | 118.75 |
| TotalCharges | 7043.0 | 2283.300441 | 2265.000258 | 18.80 | 402.225 | 1400.55 | 3786.60 | 8684.80 |
| Churn | 7043.0 | 0.265370 | 0.441561 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| InternetService_DSL | 7043.0 | 0.343746 | 0.474991 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| InternetService_Fiber optic | 7043.0 | 0.439585 | 0.496372 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| InternetService_No | 7043.0 | 0.216669 | 0.412004 | 0.00 | 0.000 | 0.00 | 0.00 | 1.00 |
| Contract_Month-to-month | 7043.0 | 0.550192 | 0.497510 | 0.00 | 0.000 | 1.00 | 1.00 | 1.00 |
| Contract_One year | 7043.0 | 0.209144 | 0.406726 | 0.00 | 0.000 | 0.00 | 0.00 | 1.00 |
| Contract_Two year | 7043.0 | 0.240664 | 0.427517 | 0.00 | 0.000 | 0.00 | 0.00 | 1.00 |
| PaymentMethod_Bank transfer (automatic) | 7043.0 | 0.219225 | 0.413751 | 0.00 | 0.000 | 0.00 | 0.00 | 1.00 |
| PaymentMethod_Credit card (automatic) | 7043.0 | 0.216101 | 0.411613 | 0.00 | 0.000 | 0.00 | 0.00 | 1.00 |
| PaymentMethod_Electronic check | 7043.0 | 0.335794 | 0.472301 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| PaymentMethod_Mailed check | 7043.0 | 0.228880 | 0.420141 | 0.00 | 0.000 | 0.00 | 0.00 | 1.00 |
| OnlineSecurity_No | 7043.0 | 0.496663 | 0.500024 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| OnlineSecurity_No internet service | 7043.0 | 0.216669 | 0.412004 | 0.00 | 0.000 | 0.00 | 0.00 | 1.00 |
| OnlineSecurity_Yes | 7043.0 | 0.286668 | 0.452237 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| MultipleLines_No | 7043.0 | 0.481329 | 0.499687 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| MultipleLines_No phone service | 7043.0 | 0.096834 | 0.295752 | 0.00 | 0.000 | 0.00 | 0.00 | 1.00 |
| MultipleLines_Yes | 7043.0 | 0.421837 | 0.493888 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| OnlineBackup_No | 7043.0 | 0.438450 | 0.496232 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| OnlineBackup_No internet service | 7043.0 | 0.216669 | 0.412004 | 0.00 | 0.000 | 0.00 | 0.00 | 1.00 |
| OnlineBackup_Yes | 7043.0 | 0.344881 | 0.475363 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| DeviceProtection_No | 7043.0 | 0.439443 | 0.496355 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| DeviceProtection_No internet service | 7043.0 | 0.216669 | 0.412004 | 0.00 | 0.000 | 0.00 | 0.00 | 1.00 |
| DeviceProtection_Yes | 7043.0 | 0.343888 | 0.475038 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| TechSupport_No | 7043.0 | 0.493114 | 0.499988 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| TechSupport_No internet service | 7043.0 | 0.216669 | 0.412004 | 0.00 | 0.000 | 0.00 | 0.00 | 1.00 |
| TechSupport_Yes | 7043.0 | 0.290217 | 0.453895 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| StreamingTV_No | 7043.0 | 0.398978 | 0.489723 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| StreamingTV_No internet service | 7043.0 | 0.216669 | 0.412004 | 0.00 | 0.000 | 0.00 | 0.00 | 1.00 |
| StreamingTV_Yes | 7043.0 | 0.384353 | 0.486477 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| StreamingMovies_No | 7043.0 | 0.395428 | 0.488977 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
| StreamingMovies_No internet service | 7043.0 | 0.216669 | 0.412004 | 0.00 | 0.000 | 0.00 | 0.00 | 1.00 |
| StreamingMovies_Yes | 7043.0 | 0.387903 | 0.487307 | 0.00 | 0.000 | 0.00 | 1.00 | 1.00 |
Scaling 3 columns, i.e. tenure, MonthlyCharges and TotalCharges, using z-score.
# Z-score the three continuous features with the StandardScaler created at the top.
cols_to_scale = ["MonthlyCharges","TotalCharges","tenure"]
telecom_cust_churn[cols_to_scale]=scaler.fit_transform(telecom_cust_churn[cols_to_scale])
telecom_cust_churn.head()
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | PaperlessBilling | MonthlyCharges | TotalCharges | Churn | ... | DeviceProtection_Yes | TechSupport_No | TechSupport_No internet service | TechSupport_Yes | StreamingTV_No | StreamingTV_No internet service | StreamingTV_Yes | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | -1.277445 | 0 | 1 | -1.160323 | -0.994971 | 0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 1 | 1 | 0 | 0 | 0 | 0.066327 | 1 | 0 | -0.259629 | -0.173876 | 0 | ... | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 2 | 1 | 0 | 0 | 0 | -1.236724 | 1 | 1 | -0.362660 | -0.960399 | 1 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 3 | 1 | 0 | 0 | 0 | 0.514251 | 0 | 0 | -0.746535 | -0.195400 | 0 | ... | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | -1.236724 | 1 | 1 | 0.197365 | -0.941193 | 1 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
5 rows × 41 columns
telecom_cust_churn.Churn.value_counts()
0 5174 1 1869 Name: Churn, dtype: int64
Balancing Churn data set
# Undersample the majority class (Churn==0) down to the minority size (1869)
# to balance the target. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0; output is row-for-row identical.
telecom_cust_churn_balanced = pd.concat([
    telecom_cust_churn[telecom_cust_churn.Churn == 0].sample(n=1869, random_state=42),
    telecom_cust_churn[telecom_cust_churn.Churn == 1],
])
telecom_cust_churn_balanced.reset_index(drop=True, inplace=True)
telecom_cust_churn_balanced.Churn.value_counts()
0 1869 1 1869 Name: Churn, dtype: int64
X_train, X_test, y_train, y_test = train_test_split(telecom_cust_churn_balanced.drop(columns='Churn', axis=1),telecom_cust_churn_balanced.Churn, test_size=0.2, random_state=42)
X_train.shape, X_test.shape
((2990, 40), (748, 40))
# Baseline XGBoost classifier with default hyperparameters.
model=XGBClassifier(random_state=42)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)
# NOTE(review): fmt='.2f' renders the integer confusion-matrix counts with
# decimals; fmt='d' would be cleaner — confirm intent.
sns.heatmap(confusion_matrix(y_test,y_pred), annot=True, fmt='.2f')
print('Train Accuracy Score:',accuracy_score(y_train,y_train_pred))
print('Test Accuracy Score:',accuracy_score(y_test,y_pred))
print('ROC AUC Score:',roc_auc_score(y_test,y_pred))
print("\nClassification Matrix:\n",classification_report(y_test, y_pred))
# Seed the model-comparison table that is extended after each tuning run.
model_list= [['XGBClassifier', accuracy_score(y_train, y_train_pred), accuracy_score(y_test, y_pred), roc_auc_score(y_test, y_pred), precision_score(y_test, y_pred), recall_score(y_test, y_pred),f1_score(y_test, y_pred)]]
Train Accuracy Score: 0.9719063545150501
Test Accuracy Score: 0.7459893048128342
ROC AUC Score: 0.7462656684614338
Classification Matrix:
precision recall f1-score support
0 0.76 0.73 0.74 379
1 0.73 0.77 0.75 369
accuracy 0.75 748
macro avg 0.75 0.75 0.75 748
weighted avg 0.75 0.75 0.75 748
pd.DataFrame(model.feature_importances_,columns=['Imp'],index=X_train.columns).sort_values(by='Imp',ascending=False)
| Imp | |
|---|---|
| Contract_Month-to-month | 0.439045 |
| InternetService_Fiber optic | 0.174799 |
| OnlineSecurity_No | 0.030087 |
| Contract_One year | 0.026698 |
| StreamingMovies_Yes | 0.025702 |
| TechSupport_No | 0.023378 |
| Contract_Two year | 0.022211 |
| PaymentMethod_Electronic check | 0.015796 |
| StreamingTV_Yes | 0.014391 |
| tenure | 0.013428 |
| StreamingMovies_No | 0.013266 |
| PhoneService | 0.013258 |
| DeviceProtection_No | 0.012397 |
| OnlineBackup_No | 0.012224 |
| OnlineBackup_Yes | 0.011853 |
| InternetService_DSL | 0.010373 |
| PaymentMethod_Mailed check | 0.010370 |
| PaymentMethod_Bank transfer (automatic) | 0.010278 |
| MultipleLines_No | 0.009929 |
| MonthlyCharges | 0.009542 |
| InternetService_No | 0.009512 |
| SeniorCitizen | 0.009414 |
| Dependents | 0.009345 |
| TotalCharges | 0.009277 |
| PaperlessBilling | 0.009059 |
| MultipleLines_Yes | 0.008953 |
| PaymentMethod_Credit card (automatic) | 0.008905 |
| gender | 0.008417 |
| Partner | 0.008081 |
| StreamingTV_No | 0.006544 |
| OnlineSecurity_Yes | 0.004899 |
| DeviceProtection_Yes | 0.004841 |
| TechSupport_Yes | 0.003724 |
| OnlineBackup_No internet service | 0.000000 |
| DeviceProtection_No internet service | 0.000000 |
| MultipleLines_No phone service | 0.000000 |
| TechSupport_No internet service | 0.000000 |
| StreamingTV_No internet service | 0.000000 |
| StreamingMovies_No internet service | 0.000000 |
| OnlineSecurity_No internet service | 0.000000 |
Contract_Month-to-month feature has the highest importance in model.
There are 7 features that can be removed from our model, as they have 0 importance. These features are: OnlineBackup_No internet service, DeviceProtection_No internet service, MultipleLines_No phone service, TechSupport_No internet service, StreamingTV_No internet service, StreamingMovies_No internet service and OnlineSecurity_No internet service.
Removing features which have 0 importance.
X_rand_train, X_rand_test, y_rand_train, y_rand_test = train_test_split(telecom_cust_churn_balanced.drop(columns=['Churn','OnlineBackup_No internet service','DeviceProtection_No internet service','MultipleLines_No phone service','TechSupport_No internet service','StreamingTV_No internet service','StreamingMovies_No internet service','OnlineSecurity_No internet service'], axis=1),telecom_cust_churn_balanced.Churn, test_size=0.2, random_state=42)
X_rand_train.shape, X_rand_test.shape
((2990, 33), (748, 33))
# Hyperparameter search space for XGBoost.
params = {
    'learning_rate':[0.05,0.10,0.15,0.20],
    'max_depth':[3,4,5,6],
    'min_child_weight':[1,3,5],
    'gamma':[0.1,0.2,0.3,0.4],
    'colsample_bytree':[0.3,0.4,0.5],
    'n_estimators':[20,40,50,60]
}
# Randomized search: 5 candidate draws, 5-fold CV, scored on ROC AUC.
rs_model=RandomizedSearchCV(model,param_distributions=params,n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5, random_state=42)
rs_model.fit(X_rand_train,y_rand_train)
# print best parameter after tuning
print(rs_model.best_params_)
# NOTE(review): RandomizedSearchCV refits best_estimator_ automatically
# (refit defaults to True), so this explicit refit is redundant — confirm.
rs_model.best_estimator_.fit(X_rand_train,y_rand_train)
y_rand_pred = rs_model.best_estimator_.predict(X_rand_test)
y_rand_train_pred = rs_model.best_estimator_.predict(X_rand_train)
sns.heatmap(confusion_matrix(y_rand_test,y_rand_pred), annot=True, fmt='.2f')
print('Train Accuracy Score:',accuracy_score(y_rand_train,y_rand_train_pred))
print('Test Accuracy Score:',accuracy_score(y_rand_test,y_rand_pred))
print('ROC AUC Score:',roc_auc_score(y_rand_test,y_rand_pred))
print("\nClassification Matrix:\n",classification_report(y_rand_test, y_rand_pred))
# Append the tuned model's metrics to the comparison table.
model_list.append(['XGBClassifier_RandomizedSearchCV', accuracy_score(y_rand_train,y_rand_train_pred), accuracy_score(y_rand_test, y_rand_pred), roc_auc_score(y_rand_test,y_rand_pred), precision_score(y_rand_test,y_rand_pred), recall_score(y_rand_test,y_rand_pred), f1_score(y_rand_test,y_rand_pred)])
{'n_estimators': 50, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.15, 'gamma': 0.1, 'colsample_bytree': 0.5}
Train Accuracy Score: 0.802675585284281
Test Accuracy Score: 0.7807486631016043
ROC AUC Score: 0.7809954880551444
Classification Matrix:
precision recall f1-score support
0 0.80 0.76 0.78 379
1 0.77 0.80 0.78 369
accuracy 0.78 748
macro avg 0.78 0.78 0.78 748
weighted avg 0.78 0.78 0.78 748
pd.DataFrame(model_list,columns=['Model','Train_Accuracy','Test_Accuracy','ROC_AUC','Precision','Recall','F1 Score']).sort_values(by=['Recall','F1 Score'], ascending=False)
| Model | Train_Accuracy | Test_Accuracy | ROC_AUC | Precision | Recall | F1 Score | |
|---|---|---|---|---|---|---|---|
| 1 | XGBClassifier_RandomizedSearchCV | 0.802676 | 0.780749 | 0.780995 | 0.766234 | 0.799458 | 0.782493 |
| 0 | XGBClassifier | 0.971906 | 0.745989 | 0.746266 | 0.731266 | 0.766938 | 0.748677 |
pd.DataFrame(rs_model.best_estimator_.feature_importances_,columns=['Imp'],index=X_rand_train.columns).sort_values(by='Imp',ascending=False)
| Imp | |
|---|---|
| Contract_Month-to-month | 0.387774 |
| OnlineSecurity_No | 0.076351 |
| PaymentMethod_Electronic check | 0.074828 |
| Contract_Two year | 0.059022 |
| TechSupport_No | 0.045519 |
| OnlineSecurity_Yes | 0.035582 |
| InternetService_No | 0.031923 |
| InternetService_Fiber optic | 0.031788 |
| tenure | 0.023499 |
| Contract_One year | 0.021907 |
| StreamingMovies_Yes | 0.020114 |
| TotalCharges | 0.016778 |
| PaperlessBilling | 0.016757 |
| InternetService_DSL | 0.014821 |
| PaymentMethod_Bank transfer (automatic) | 0.014153 |
| MonthlyCharges | 0.013401 |
| StreamingTV_Yes | 0.011435 |
| MultipleLines_No | 0.010613 |
| DeviceProtection_No | 0.010299 |
| DeviceProtection_Yes | 0.010281 |
| MultipleLines_Yes | 0.008929 |
| TechSupport_Yes | 0.008168 |
| OnlineBackup_No | 0.008108 |
| PaymentMethod_Credit card (automatic) | 0.007684 |
| SeniorCitizen | 0.007103 |
| Dependents | 0.007018 |
| gender | 0.005976 |
| StreamingTV_No | 0.005315 |
| PaymentMethod_Mailed check | 0.005204 |
| PhoneService | 0.004909 |
| OnlineBackup_Yes | 0.004741 |
| Partner | 0.000000 |
| StreamingMovies_No | 0.000000 |
Contract_Month-to-month feature has the highest importance in the model. There are 2 features that can be removed from our model, as they have 0 importance. These features are: Partner and StreamingMovies_No.
After tuning some parameters using RandomizedSearchCV, accuracy has improved. Hyperparameters used: {'n_estimators': 50, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.15, 'gamma': 0.1, 'colsample_bytree': 0.5}. Train_Accuracy=80%, Test_Accuracy=78%.
X_perf_train, X_perf_test, y_perf_train, y_perf_test = train_test_split(telecom_cust_churn_balanced.drop(columns=['Churn','OnlineBackup_No internet service','DeviceProtection_No internet service','MultipleLines_No phone service','TechSupport_No internet service','StreamingTV_No internet service','StreamingMovies_No internet service','OnlineSecurity_No internet service','OnlineSecurity_Yes','PhoneService'], axis=1),telecom_cust_churn_balanced.Churn, test_size=0.2, random_state=42)
X_perf_train.shape, X_perf_test.shape
((2990, 31), (748, 31))
# defining parameter range
# Exhaustive grid over the same search space used for the randomized search.
param_grid = {
    'learning_rate':[0.05,0.10,0.15,0.20],
    'max_depth':[3,4,5,6],
    'min_child_weight':[1,3,5],
    'gamma':[0.1,0.2,0.3,0.4],
    'colsample_bytree':[0.3,0.4,0.5],
    'n_estimators':[20,40,50,60]
}
# Grid search: 5-fold CV, ROC AUC scoring, refit on the best combination.
grid = GridSearchCV(XGBClassifier(random_state=42), param_grid,scoring='roc_auc',n_jobs=-1,cv=5,refit=True)
# fitting the model for grid search
grid.fit(X_perf_train,y_perf_train)
# print best parameter after tuning
print(grid.best_params_)
{'colsample_bytree': 0.3, 'gamma': 0.3, 'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 50}
# Evaluate the grid-search best estimator on the grid-search split.
grid.best_estimator_.fit(X_perf_train,y_perf_train)
y_perf_pred = grid.best_estimator_.predict(X_perf_test)
y_perf_train_pred = grid.best_estimator_.predict(X_perf_train)
# BUG FIX: was confusion_matrix(y_test, y_perf_pred), which compared labels
# from the FIRST split against predictions from this split; use y_perf_test.
# fmt='d' renders the integer counts without spurious decimals.
sns.heatmap(confusion_matrix(y_perf_test,y_perf_pred), annot=True, fmt='d')
print('Train Accuracy Score:',accuracy_score(y_perf_train,y_perf_train_pred))
print('Test Accuracy Score:',accuracy_score(y_perf_test,y_perf_pred))
print('ROC AUC Score:',roc_auc_score(y_perf_test,y_perf_pred))
print("\nClassification Matrix:\n",classification_report(y_perf_test, y_perf_pred))
# Append the grid-search model's metrics to the comparison table.
model_list.append(['XGBClassifier_GridSearchCV', accuracy_score(y_perf_train,y_perf_train_pred), accuracy_score(y_perf_test, y_perf_pred), roc_auc_score(y_perf_test,y_perf_pred), precision_score(y_perf_test,y_perf_pred), recall_score(y_perf_test,y_perf_pred), f1_score(y_perf_test,y_perf_pred)])
Train Accuracy Score: 0.8096989966555184
Test Accuracy Score: 0.7740641711229946
ROC AUC Score: 0.7743276773137124
Classification Matrix:
precision recall f1-score support
0 0.79 0.75 0.77 379
1 0.76 0.79 0.78 369
accuracy 0.77 748
macro avg 0.77 0.77 0.77 748
weighted avg 0.77 0.77 0.77 748
pd.DataFrame(model_list,columns=['Model','Train_Accuracy','Test_Accuracy','ROC_AUC','Precision','Recall','F1 Score']).sort_values(by=['Recall','F1 Score'], ascending=False)
| Model | Train_Accuracy | Test_Accuracy | ROC_AUC | Precision | Recall | F1 Score | |
|---|---|---|---|---|---|---|---|
| 1 | XGBClassifier_RandomizedSearchCV | 0.802676 | 0.780749 | 0.780995 | 0.766234 | 0.799458 | 0.782493 |
| 2 | XGBClassifier_GridSearchCV | 0.809699 | 0.774064 | 0.774328 | 0.759067 | 0.794038 | 0.776159 |
| 0 | XGBClassifier | 0.971906 | 0.745989 | 0.746266 | 0.731266 | 0.766938 | 0.748677 |
Different hyperparameter-tuning strategies were applied, which increased accuracy along with recall and F1 score.
def load_read_dataset(filename, names=None, delimiter=',', usecol=None):
    """
    Read a delimited text file into a DataFrame and print a short summary.

    :param filename: name of the file to read, e.g. 'credit.csv'
    :param names: optional column names when the file has no header, or to
                  override the existing ones, e.g. ['purpose','amount','age','default']
    :param delimiter: field separator; ',' by default for .csv files
    :param usecol: optional subset of columns to load, e.g. ['age','default']
    :return: the loaded DataFrame
    """
    # Bug fix: the original format string contained a literal '(unknown)'
    # where the {filename} placeholder belonged, so the filename was never
    # printed; use f-string interpolation for all parameters.
    print(f"Reading dataset filename:{filename}, names:{names}, delimiter:{delimiter}, usecol:{usecol}")
    # Reading the file with the caller-supplied options
    df1 = pd.read_csv(filename, sep=delimiter, names=names, usecols=usecol)
    # Printing total number of entries with features
    print(f"Dataset have {df1.shape[0]} entries with {df1.shape[1]} features.\n")
    # print dataframe information
    print('Dataframe Information:')
    print(df1.dtypes)
    # checking random 5 data from dataset
    print('\n5 Sample set from dataframe:')
    print(df1.sample(5))
    print('Dataset loaded successfully.')
    return df1
def check_duplicate(df1):
    """
    Remove duplicate rows from the dataframe if any are present.

    :param df1: pandas DataFrame
    :return: the DataFrame with duplicate rows dropped (mutated in place too)
    """
    print('Duplicate check analysis started.')
    duplicates = df1[df1.duplicated()]
    if not duplicates.empty:
        # Fixed message wording (was 'Duplicate founded:').
        print('\nDuplicates found:\n')
        print(duplicates)
        print('\nRemoving duplicates\n')
        df1.drop_duplicates(inplace=True)
    print('Duplicate check analysis completed.')
    return df1
def empty_data_check(df1):
    """
    Impute missing values: numeric columns get the median, string columns get
    the mode (empty strings are treated as missing too).

    :param df1: pandas DataFrame
    :return: the DataFrame with missing values filled
    """
    print('Empty data check analysis started.')
    for col_name in df1.columns:
        if is_numeric_dtype(df1[col_name]) and df1[col_name].isnull().sum() > 0:
            # Bug fix: fillna() without assignment/inplace discarded its
            # result, so the imputation never took effect.
            df1[col_name] = df1[col_name].fillna(df1[col_name].median())
        if is_string_dtype(df1[col_name]):
            val = df1[col_name].mode()[0]
            # Bug fix: assign the filled series back (was a no-op before).
            df1[col_name] = df1[col_name].fillna(val)
            df1[col_name] = df1[col_name].replace('', val)
    print('Empty data check analysis completed.')
    return df1
def object_numeric_type_conversion(df1, object_numeric_names):
    """
    Convert object-dtype columns holding numbers-as-text to numeric dtype.
    Blank or non-parsable entries become NaN and are imputed with the column mean.

    :param df1: pandas DataFrame
    :param object_numeric_names: column names to convert to numeric dtype
    :return: the DataFrame with the listed columns converted
    """
    print('Object to numeric conversion started')
    def extract_number(val):
        # Non-strings (e.g. NaN floats) and blank strings carry no value.
        # Bug fix: the original called val.strip() on any type, raising
        # AttributeError for non-string entries.
        if not isinstance(val, str) or not val.strip():
            return np.nan
        # Bug fix: joining every digit run dropped the decimal point
        # ('4214.25' -> '421425'); keep the first decimal number instead.
        match = re.search(r'\d+(?:\.\d+)?', val)
        return match.group() if match else np.nan
    for col_name in object_numeric_names:
        df1[col_name] = df1[col_name].apply(extract_number)
        df1[col_name] = pd.to_numeric(df1[col_name], errors='coerce')
        df1[col_name] = df1[col_name].fillna(df1[col_name].mean())
    print('Object to numeric conversion completed')
    return df1
def outlier_check(df1):
    """
    Cap numeric-column outliers at the Tukey (IQR) whiskers.

    :param df1: pandas DataFrame
    :return: the DataFrame with outliers clipped to q1-1.5*IQR / q3+1.5*IQR
    """
    print('Outlier check analysis started.')
    for col_name in df1.columns:
        if is_numeric_dtype(df1[col_name]):
            q1 = np.quantile(df1[col_name], 0.25)
            q3 = np.quantile(df1[col_name], 0.75)
            cut_off = 1.5 * (q3 - q1)
            right_whisker = q3 + cut_off
            left_whisker = q1 - cut_off
            # Bug fix: the original fed positional indices from np.where into
            # label-based .loc, which mis-targets (or creates) rows whenever
            # the index is not 0..n-1, e.g. after drop_duplicates. Boolean
            # masks are label-safe and vectorized.
            df1.loc[df1[col_name] > right_whisker, col_name] = right_whisker
            df1.loc[df1[col_name] < left_whisker, col_name] = left_whisker
    print('Outlier check analysis completed')
    return df1
def pre_processing(df1, object_numeric_names):
    """
    Run the full cleaning pipeline: duplicate removal, missing-data imputation,
    object-to-numeric conversion and outlier treatment.

    :param df1: pandas DataFrame
    :param object_numeric_names: column names to be converted to numeric dtype
    :return: the cleaned DataFrame
    """
    print('Pre-processing analysis started')
    steps = (
        check_duplicate,
        empty_data_check,
        lambda frame: object_numeric_type_conversion(frame, object_numeric_names),
        outlier_check,
    )
    for step in steps:
        df1 = step(df1)
    print('Pre-processing analysis completed')
    return df1
def visualization(df1, pie_col_names):
    """
    Show three views of the data: a pie chart per requested column, a pair
    plot, and a correlation heat map.

    :param df1: pandas DataFrame
    :param pie_col_names: columns to render as pie charts
    """
    print('Data Visualization with different features.')
    print('Pie Chart Visualization')
    for column in pie_col_names:
        counts = df1[column].value_counts()
        fig = px.pie(
            df1,
            hole=0.3,
            values=counts,
            names=counts.index,
            color_discrete_sequence=px.colors.sequential.turbid_r,
            title=column + ' Column Representation',
        )
        fig.show()
    print('Pair-Plot Visualization')
    sns.pairplot(df1)
    plt.show()
    print('HeatMap Visualization')
    sns.heatmap(df1.corr(), annot=True, fmt='.2f')
    plt.show()
def data_conversion(df1, replace_struct, cat_numeric_name, one_hot_encoder_name, standardization_col_name):
    """
    Apply the requested encodings and transformations to the dataframe.

    :param df1: pandas DataFrame
    :param replace_struct: user-defined mapping of values to replace, e.g.
        { "checking_balance":{"< 0 DM":1,"1-200 DM":2,"> 200 DM":3},
          "job":{"unemployed":1,"unskilled":2,"skilled":3} }
    :param cat_numeric_name: columns to label-encode from category to numeric
    :param one_hot_encoder_name: columns to expand into dummy/indicator variables
    :param standardization_col_name: columns to standardize (zero mean, unit variance)
    :return: the converted DataFrame
    """
    print('Data conversion on the basis of LabelEncoder or OneHotEncoder or user defined structure')
    # Label-encode each requested categorical column.
    for column in (cat_numeric_name or []):
        df1[column] = label_encoder.fit_transform(df1[column])
    if replace_struct:
        df1 = df1.replace(replace_struct)
    if one_hot_encoder_name:
        df1 = pd.get_dummies(df1, columns=one_hot_encoder_name)
    if standardization_col_name:
        df1[standardization_col_name] = scaler.fit_transform(df1[standardization_col_name])
    print('Data conversion completed.')
    return df1
def features_removal(df1, feature_names):
    """
    Drop features that are not needed for model training.

    :param df1: pandas DataFrame (modified in place)
    :param feature_names: column names to remove
    :return: the same DataFrame, for call chaining — added for consistency
             with the other pipeline helpers, which all return df1
    """
    print('Eliminating unwanted features for model training')
    # 'columns=' already selects the column axis; the redundant axis=1 is dropped.
    df1.drop(columns=feature_names, inplace=True)
    return df1
def train_model(df1, is_regression, target_value, test_size_percent):
    """
    Train a suite of candidate models with GridSearchCV hyper-parameter tuning
    and collect their metrics.

    :param df1: pandas DataFrame holding features plus the target column
    :param is_regression: if truthy, train regressors; otherwise classifiers
    :param target_value: name of the target column in df1
    :param test_size_percent: percentage (0-100) of data used for the test split
    :return: list of [model_name, <metrics...>, fitted best estimator] rows
    """
    model_details,param_details,train_model_lists={},{},[]
    # Oversample with SMOTE before splitting.
    # NOTE(review): SMOTE is a classification oversampler, yet it runs even
    # when is_regression is True — confirm regression targets never reach
    # this path, or gate the resampling on is_regression.
    X,y=sm.fit_resample(df1.drop(target_value, axis=1), df1[target_value])
    # test_size_percent is a percentage; divide by 100 for the split fraction.
    X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=test_size_percent/100, random_state=42)
    if is_regression:
        # Candidate regressors keyed by display name.
        model_details={
            'Linear_Regression': LinearRegression(),
            'Ridge_Regression': Ridge(random_state=42),
            'Lasso_Regression': Lasso(random_state=42),
            'KNN_Regression': KNeighborsRegressor(),
            'SVM_Regression': SVR(),
            'DecisionTree_Regression': DecisionTreeRegressor(random_state=42),
            'RandomForest_Regression': RandomForestRegressor(random_state=42),
            'GradientBoosting_Regression': GradientBoostingRegressor(random_state=42),
            'AdaBoost_Regression': AdaBoostRegressor(random_state=42),
            'XGB_Regression': XGBRegressor(random_state=42)
        }
        # Grid-search spaces, keyed to match model_details.
        param_details={
            'Linear_Regression': {},
            'Ridge_Regression': {'alpha': np.logspace(-8, 8, 100), 'fit_intercept': [True, False], 'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']},
            'Lasso_Regression': {'alpha': np.logspace(-8, 8, 100)},
            # n_neighbors ranges up to sqrt(train size), a common heuristic cap.
            'KNN_Regression':{'n_neighbors': np.arange(1,int(np.sqrt(len(X_train)))).tolist(),'p': [1,2],'weights': ['uniform','distance']},
            'SVM_Regression': {'kernel':['linear','rbf'],'C':[0.1,1,10,100],'gamma':[1,0.1,0.001]},
            'DecisionTree_Regression': {'criterion':['squared_error','friedman_mse','absolute_error','poisson'],'min_samples_split':[10,20,40],'max_depth':[2,6,8],'min_samples_leaf':[20,40,100],'max_leaf_nodes':[5,20,100]},
            'RandomForest_Regression': {'max_depth': [5,10,None], 'max_features': ['sqrt','log2',None], 'n_estimators': [5,6,7,8,9,10,11,12,13,15]},
            'GradientBoosting_Regression': {'learning_rate': [0.05,0.10,0.15],'min_samples_split': np.linspace(0.1,0.5,6),'min_samples_leaf': np.linspace(0.1,0.5,6),'max_depth':[3,5,8],'max_features':['log2','sqrt'],'subsample':[0.5,1.0],'n_estimators':[20,40,50,60]},
            'AdaBoost_Regression': {'n_estimators': [20,40,50,60],'learning_rate': [0.05,0.10,0.15]},
            'XGB_Regression': {'learning_rate':[0.05,0.10,0.15],'max_depth':[3,4,5,6],'min_child_weight':[1,3,5],'gamma':[0.1,0.2,0.3],'colsample_bytree':[0.3,0.4,0.5],'n_estimators':[20,40,50,60]}
        }
    else:
        # Candidate classifiers keyed by display name.
        model_details={
            'KNN_Classifier': KNeighborsClassifier(),
            'SVM_Classifier': SVC(random_state=42),
            'XGB_Classifier': XGBClassifier(random_state=42),
            'GaussianNB': GaussianNB(),
            'GradientBoosting_Classifier': GradientBoostingClassifier(random_state=42),
            'AdaBoost_Classifier': AdaBoostClassifier(random_state=42),
            'RandomForest_Classifier': RandomForestClassifier(random_state=42),
            'DecisionTree_Classifier': DecisionTreeClassifier(random_state=42),
        }
        # Grid-search spaces, keyed to match model_details.
        param_details={
            'KNN_Classifier': {'n_neighbors': np.arange(1,int(np.sqrt(len(X_train)))).tolist(), 'p': [1,2]},
            'SVM_Classifier': {'C':[0.1,1,10,100],'gamma':[1,0.1,0.001], 'kernel':['linear','rbf']},
            'XGB_Classifier': {'learning_rate':[0.05,0.10,0.15],'max_depth':[3,4,5,6],'min_child_weight':[1,3,5],'gamma':[0.1,0.2,0.3],'colsample_bytree':[0.3,0.4,0.5],'n_estimators':[20,40,50,60]},
            'GaussianNB': {'var_smoothing': np.logspace(0,-9, num=100)},
            'GradientBoosting_Classifier': {'learning_rate': [0.05,0.10,0.15],'min_samples_split': np.linspace(0.1,0.5,6),'min_samples_leaf': np.linspace(0.1,0.5,6),'max_depth':[3,5,8],'max_features':['log2','sqrt'],'subsample':[0.5,1.0],'n_estimators':[20,40,50,60]},
            'AdaBoost_Classifier': {'n_estimators': [20,40,50,60],'learning_rate': [0.05,0.10,0.15]},
            'RandomForest_Classifier': {'n_estimators': [20,40,50,60],'max_depth' : [4,5,6,7,8],'criterion' :['gini','entropy']},
            'DecisionTree_Classifier': {'ccp_alpha': [0.1,.01,.001],'max_depth': [5, 6, 7, 8, 9],'criterion' :['gini', 'entropy']},
        }
    # Tune each candidate with 3-fold grid search and record its metrics.
    for key in model_details.keys():
        print("Running GridSearchCV for %s." % key)
        grid_search = GridSearchCV(model_details.get(key), param_details.get(key), cv=3, n_jobs=3, refit=True)
        grid_search.fit(X_train,y_train)
        predicted_model=grid_search.best_estimator_
        # refit=True already fit the best estimator on X_train; this second
        # fit is redundant but harmless.
        predicted_model.fit(X_train,y_train)
        y_pred = predicted_model.predict(X_test)
        y_train_pred = predicted_model.predict(X_train)
        if is_regression:
            # Row layout: name, train R^2, test R^2, MAE, MSE, RMSE, R2, model.
            train_model_lists.append([key, predicted_model.score(X_train,y_train), predicted_model.score(X_test,y_test),
            mean_absolute_error(y_test,y_pred), mean_squared_error(y_test,y_pred), np.sqrt(mean_squared_error(y_test,y_pred)), r2_score(y_test, y_pred), predicted_model])
        else:
            # Row layout: name, train acc, test acc, ROC AUC, precision, recall, F1, model.
            train_model_lists.append([key, accuracy_score(y_train,y_train_pred), accuracy_score(y_test, y_pred),
            roc_auc_score(y_test,y_pred), precision_score(y_test,y_pred), recall_score(y_test,y_pred), f1_score(y_test,y_pred), predicted_model])
        print("GridSearchCV for %s completed." % key)
    return train_model_lists
def predict_best_model(train_model_lists, is_regression):
    """
    Compare the trained models on their metrics and return the best one.

    :param train_model_lists: list of [name, metrics..., fitted model] rows
    :param is_regression: True when the rows hold regression metrics
    :return: the fitted estimator that ranks first after sorting
    """
    print('Different models with metrics')
    if is_regression:
        columns = ['Model_Name', 'Train_Accuracy', 'Test_Accuracy', 'MAE', 'MSE', 'RMSE', 'R2_Score', 'Model']
        sort_keys = ['R2_Score']
    else:
        columns = ['Model_Name', 'Train_Accuracy', 'Test_Accuracy', 'ROC_AUC', 'Precision', 'Recall', 'F1 Score', 'Model']
        sort_keys = ['Recall', 'F1 Score']
    model_df = pd.DataFrame(train_model_lists, columns=columns).sort_values(by=sort_keys, ascending=False)
    print(model_df)
    return model_df.head(1).Model.values[0]
def process(filename, object_numeric_names, pie_col_names, replace_struct, cat_numeric_name, one_hot_encoder_name, standardization_col_name, feature_names, is_regression, target_value, test_size_percent, names=None, delimiter=',', usecol=None):
    """
    End-to-end pipeline: load the data, clean it, visualize it, encode it,
    prune features, train candidate models and return the best fitted one.

    :param filename: name of the file to read, e.g. 'credit.csv'
    :param object_numeric_names: column names to convert to numeric dtype
    :param pie_col_names: columns to render as pie charts
    :param replace_struct: user-defined mapping of values to replace, e.g.
        { "checking_balance":{"< 0 DM":1,"1-200 DM":2,"> 200 DM":3},
          "job":{"unemployed":1,"unskilled":2,"skilled":3} }
    :param cat_numeric_name: columns to label-encode from category to numeric
    :param one_hot_encoder_name: columns to expand into dummy/indicator variables
    :param standardization_col_name: columns to standardize (zero mean, unit variance)
    :param feature_names: column names to drop before training
    :param is_regression: train regressors when True, classifiers otherwise
    :param target_value: name of the target column
    :param test_size_percent: percentage (0-100) of data used for the test split
    :param names: optional column names when the file has no header
    :param delimiter: field separator; ',' by default
    :param usecol: optional subset of columns to load, e.g. ['age','default']
    :return: the best fitted estimator
    """
    frame = load_read_dataset(filename, names, delimiter, usecol)
    frame = pre_processing(frame, object_numeric_names)
    visualization(frame, pie_col_names)
    frame = data_conversion(frame, replace_struct, cat_numeric_name, one_hot_encoder_name, standardization_col_name)
    features_removal(frame, feature_names)
    trained_models = train_model(frame, is_regression, target_value, test_size_percent)
    return predict_best_model(trained_models, is_regression)
# Classification: run the full pipeline on the churn dataset.
best_model = process(
    'TelcomCustomer-Churn_2.csv',
    ['TotalCharges'],
    ['OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod'],
    None,
    ['PaperlessBilling', 'Churn'],
    ['OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod'],
    ['MonthlyCharges', 'TotalCharges'],
    ['customerID'],
    False,
    'Churn',
    20,
)
print(best_model)
# Persist the winning model to disk - model_pkl
with open('model_pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)
Reading dataset filename:TelcomCustomer-Churn_2.csv, names:None, delimiter:,, usecol:None
Dataset have 7043 entries with 12 features.
Dataframe Information:
customerID object
OnlineBackup object
DeviceProtection object
TechSupport object
StreamingTV object
StreamingMovies object
Contract object
PaperlessBilling object
PaymentMethod object
MonthlyCharges float64
TotalCharges object
Churn object
dtype: object
5 Sample set from dataframe:
customerID OnlineBackup DeviceProtection \
3422 1837-YQUCE No Yes
5362 0485-ZBSLN No internet service No internet service
6821 8784-CGILN No No
3188 0587-DMGBH No No
2135 4010-YLMVT Yes No
TechSupport StreamingTV StreamingMovies \
3422 Yes Yes Yes
5362 No internet service No internet service No internet service
6821 Yes Yes Yes
3188 No No No
2135 No Yes Yes
Contract PaperlessBilling PaymentMethod \
3422 Two year Yes Bank transfer (automatic)
5362 Two year No Credit card (automatic)
6821 Month-to-month Yes Bank transfer (automatic)
3188 Month-to-month Yes Electronic check
2135 Month-to-month Yes Credit card (automatic)
MonthlyCharges TotalCharges Churn
3422 58.35 4214.25 No
5362 24.75 1715.1 No
6821 99.85 1776.95 Yes
3188 49.85 365.55 Yes
2135 106.60 5893.95 No
Dataset loaded successfully.
Pre-processing analysis started
Duplicate check analysis started.
Duplicate check analysis completed.
Empty data check analysis started.
Empty data check analysis completed.
Object to numeric conversion started
Object to numeric conversion completed
Outlier check analysis started.
Outlier check analysis completed
Pre-processing analysis completed
Data Visualization with different features.
Pie Chart Visualization
Pair-Plot Visualization
HeatMap Visualization
Data conversion on the basis of LabelEncoder or OneHotEncoder or user defined structure
Data conversion completed.
Eliminating unwanted features for model training
Running GridSearchCV for KNN_Classifier.
GridSearchCV for KNN_Classifier completed.
Running GridSearchCV for SVM_Classifier.
GridSearchCV for SVM_Classifier completed.
Running GridSearchCV for XGB_Classifier.
GridSearchCV for XGB_Classifier completed.
Running GridSearchCV for GaussianNB.
GridSearchCV for GaussianNB completed.
Running GridSearchCV for GradientBoosting_Classifier.
GridSearchCV for GradientBoosting_Classifier completed.
Running GridSearchCV for AdaBoost_Classifier.
GridSearchCV for AdaBoost_Classifier completed.
Running GridSearchCV for RandomForest_Classifier.
GridSearchCV for RandomForest_Classifier completed.
Running GridSearchCV for DecisionTree_Classifier.
GridSearchCV for DecisionTree_Classifier completed.
Different models with metrics
Model_Name Train_Accuracy Test_Accuracy ROC_AUC \
3 GaussianNB 0.749577 0.755072 0.753518
1 SVM_Classifier 0.839937 0.813043 0.812375
6 RandomForest_Classifier 0.811186 0.795652 0.794759
2 XGB_Classifier 0.848273 0.807729 0.807028
0 KNN_Classifier 0.996497 0.812560 0.812108
5 AdaBoost_Classifier 0.752839 0.758454 0.757482
4 GradientBoosting_Classifier 0.766973 0.780676 0.780048
7 DecisionTree_Classifier 0.779778 0.779710 0.779395
Precision Recall F1 Score \
3 0.711719 0.868446 0.782310
1 0.788831 0.861773 0.823690
6 0.765254 0.860820 0.810229
2 0.782798 0.858913 0.819091
0 0.796945 0.845567 0.820537
5 0.730479 0.829361 0.776786
4 0.761194 0.826501 0.792505
7 0.771769 0.802669 0.786916
Model
3 GaussianNB(var_smoothing=0.0012328467394420659)
1 SVC(C=100, gamma=0.1, random_state=42)
6 (DecisionTreeClassifier(max_depth=8, max_featu...
2 XGBClassifier(base_score=0.5, booster='gbtree'...
0 KNeighborsClassifier(n_neighbors=1)
5 (DecisionTreeClassifier(max_depth=1, random_st...
4 ([DecisionTreeRegressor(criterion='friedman_ms...
7 DecisionTreeClassifier(ccp_alpha=0.001, max_de...
GaussianNB(var_smoothing=0.0012328467394420659)
# Reload the persisted model to confirm the pickle round-trips.
# NOTE: only unpickle files you trust — pickle can execute arbitrary code.
with open('model_pkl', 'rb') as model_file:
    restored_model = pickle.load(model_file)
print(restored_model)
GaussianNB(var_smoothing=0.0012328467394420659)